This notebook explores the use of SingleR
to perform cell-type annotation on datasets from the ScPCA project
SCPCP000007 (Gawad lab data), which also features CITE-Seq ADT counts
for validation.
Note: The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.
# load the R project
project_root <- here::here()
renv::load(project_root)
suppressPackageStartupMessages({
library(SingleCellExperiment)
library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!
utils_dir <- file.path(project_root, "scripts", "utils")
source(file.path(utils_dir, "integration-helpers.R"))
sce_file_suffix <- "processed_citeseq.rds"
integrated_sce_file <- file.path(project_root,
params$integrated_sce_dir,
paste0(params$project_id, "_integrated_", params$integration_method, "_sce.rds")
)
if (!file.exists(integrated_sce_file)){
stop("Integrated SCE file could not be found.")
}
Read in the data:
library_metadata_df <- readr::read_tsv(file.path(project_root, params$library_file))
Rows: 25 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (15): project_name, submitter, library_biomaterial_id, sample_biomateria...
lgl (1): has_CITEseq
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
integrated_sce <- readr::read_rds(integrated_sce_file)
# Define the unintegrated SCE filenames
sce_file_paths <- library_metadata_df %>%
dplyr::filter(project_name == params$project_id) %>%
dplyr::mutate(sce_file_path = file.path(
project_root,
integration_input_dir,
sample_biomaterial_id,
glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
)) %>%
dplyr::pull(sce_file_path)
SingleR
annotationHere we perform celltype annotation with the given reference in
params$reference on each of the Gawad libraries and look at
their UMAPs colored by celltype.
First, set the reference:
if (params$reference == "hpca") {
ref_data <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
} else if (params$reference == "blueprint_encode") {
ref_data <- celldex::BlueprintEncodeData(ensembl = TRUE)
} else {
stop("Bad reference parameter; either 'hpca' or 'blueprint_encode'.")
}
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
require("ensembldb")
Warning: Unable to map 1470 of 19363 requested IDs.
Define some functions:
annotate_SingleR <- function(sce, ref_data, label_name = "label.main") {
# label_name is either "label.main" or "label.fine" to label with reference
# broad or fine-grained celltypes, respectively
#label_name <- "label.fine"
# return updated sce and the predictions themselves
preds <- SingleR::SingleR(
test = sce,
ref = ref_data,
labels = colData(ref_data)[,label_name],
BPPARAM = BiocParallel::MulticoreParam(params$cores)
)
return(preds)
}
plot_SingleR_heatmap <- function(SingleR_annotations) {
# SingleR_annotations: result object from running SingleR::SingleR
# Make and print a heatmap
heatmap <- SingleR::plotScoreHeatmap(SingleR_annotations,
# this is default but let's be explicit
show.pruned = FALSE)
print(heatmap)
}
plot_SingleR_UMAP <- function(sce, colname, legend_ncol = 1) {
# sce: SCE object containing a column `colname` with celltype predictions
# colname: Name of column with celltypes
# Ensure this is a factor
colData(sce)[,colname] <- as.factor(colData(sce)[,colname])
umap_df <- tibble::as_tibble(reducedDim(sce, "UMAP")) %>%
dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
dplyr::mutate(celltypes = colData(sce)[,colname])
plot_colors <- rainbow( length(levels(umap_df$celltypes)) )
umap <- ggplot(umap_df) +
aes(x = UMAP1, y = UMAP2, color = celltypes) +
geom_point(size = 0.2, alpha = 0.5) +
scale_color_manual(values = plot_colors) +
guides(color = guide_legend(override.aes = list(size=3),
ncol = legend_ncol)) +
ggtitle(glue::glue("Library UMAP with projected SingleR celltype annotations"))
print(umap)
}
# Function to run and optionally plot SingleR
# Return SCE with annotations `SingleR_annotations` and `SingleR_annotations_collapsed` columns
run_SingleR <- function(sce,
max_celltypes = params$max_celltypes,
viz = TRUE) {
# Print out the library
print(unique( metadata(sce)$library ))
# Run SingleR
SingleR_results <- annotate_SingleR(sce, ref_data)
# Add annotations into sce as a frequency-ordered factor
sce$SingleR_annotations <- forcats::fct_infreq(SingleR_results$pruned.labels)
# Create an additional column `SingleR_annotations_collapsed` with only max_celltypes
# This will automatically create an "Other" group with the remaining non-top-max_celltypes celltypes
sce$SingleR_annotations_collapsed <- forcats::fct_lump_n(
sce$SingleR_annotations,
max_celltypes
)
# plot if TRUE
if (viz) {
# Plot heatmap
plot_SingleR_heatmap(SingleR_results)
# Plot two UMAPs: with all celltypes and with max_celltypes
plot_SingleR_UMAP(sce, "SingleR_annotations", legend_ncol=2)
plot_SingleR_UMAP(sce, "SingleR_annotations_collapsed")
}
# Return the updated SCE object
return(sce)
}
Here we go!
# Read in all SCE files
sce_list <- purrr::map(
sce_file_paths,
readr::read_rds
)
# Annotate them all, popping out some viz along the way if specified
sce_list_annotated <- purrr::map(sce_list, run_SingleR, viz = TRUE)
[1] "SCPCL000295"
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
[1] "SCPCL000296"
[1] "SCPCL000297"
[1] "SCPCL000298"
[1] "SCPCL000299"
# Remove `ref_data` and `sce_list` which are large
rm(ref_data)
rm(sce_list)
This section has some UMAPs:
CD123 (leukemia marker) and CD3 (T-cell
marker)# First let's create a df of celltypes and ADTs:
extract_celltypes_adts <- function(sce) {
# data frame of ADT counts
adt_df <- logcounts(altExp(sce)) %>%
as.matrix() %>%
t() %>%
tibble::as_tibble(rownames = "cell_barcode") %>%
tidyr::pivot_longer(dplyr::starts_with("CD"),
names_to = "ADT",
values_to = "logcounts")
# Combine with data frame of annotations
adt_df_anno <- tibble::tibble(
celltype = sce$SingleR_annotations,
cell_barcode = rownames(colData(sce)),
library = metadata(sce)$library
) %>%
dplyr::inner_join(adt_df) %>%
dplyr::mutate(cell_name = paste(cell_barcode, library, sep = "-"))
}
# This df is generally used below:
celltype_adt_df <- purrr::map_df(sce_list_annotated, extract_celltypes_adts) %>%
# And join in UMAPs:
dplyr::inner_join(
reducedDim(integrated_sce, paste0(params$integration_method, "_UMAP")) %>%
tibble::as_tibble(rownames = "cell_name") %>%
dplyr::rename(UMAP1 = V1, UMAP2 = V2)
) %>%
dplyr::select(-cell_barcode) %>%
# remake `celltype_collapsed` so that it's the top params$max_celltypes
# considering all libraries pooled
dplyr::mutate(celltype_collapsed = forcats::fct_lump(celltype, params$max_celltypes))
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_name"
# We can now remove the large `sce_list_annotated` and `integrated_sce`
rm(sce_list_annotated)
rm(integrated_sce)
# See df:
celltype_adt_df
# RNA UMAP:
all_celltypes_rna_umap <- celltype_adt_df %>%
dplyr::select(UMAP1, UMAP2, celltype) %>%
dplyr::distinct() %>%
ggplot() +
aes(x = UMAP1, y = UMAP2, color = celltype) +
geom_point(size = 0.3, alpha = 0.4) +
guides(color = guide_legend(override.aes = list(size=3))) +
ggtitle(glue::glue("Integrated UMAP with all SingleR-annotated celltypes shown"))
all_celltypes_rna_umap
all_celltypes_rna_umap +
lims(
x = c(-6, 4),
y = c(-5, 5.5)
) +
ggtitle("The above UMAP, zoomed in")
Warning: Removed 487 rows containing missing values (geom_point).
# RNA UMAP:
max_celltypes_rna_umap <- celltype_adt_df %>%
dplyr::select(UMAP1, UMAP2, celltype_collapsed) %>%
dplyr::distinct() %>%
ggplot() +
aes(x = UMAP1, y = UMAP2, color = celltype_collapsed) +
geom_point(size = 0.3, alpha = 0.4) +
guides(color = guide_legend(override.aes = list(size=3))) +
labs(
title = glue::glue("Integrated UMAP with top {params$max_celltypes} SingleR-annotated celltypes shown"),
color = glue::glue("Top {params$max_celltypes} celltypes"))
max_celltypes_rna_umap
max_celltypes_rna_umap +
lims(
x = c(-6, 4),
y = c(-5, 5.5)
) +
ggtitle("The above UMAP, zoomed in")
Warning: Removed 487 rows containing missing values (geom_point).
# function to make UMAPs colored by ADT
make_adt_umap <- function(celltype_adt_df, adt_name) {
celltype_adt_df %>%
dplyr::filter(ADT == adt_name) %>%
ggplot() +
aes(x = UMAP1, y = UMAP2, color = logcounts) +
geom_point(size = 0.3, alpha = 0.4) +
scale_color_viridis_c() +
labs(
title = glue::glue("{adt_name} expression"),
color = "Normalized expression")
}
make_adt_umap(celltype_adt_df, "CD3") # bottom left corner tiny dots!
make_adt_umap(celltype_adt_df, "CD123") # everything is tumor
make_adt_umap(celltype_adt_df, "CD45") # should be on all differentiated cells more or less\
make_adt_umap(celltype_adt_df, "CD19") # b cells
This section explores ADT counts across predicted celltypes.
adt_celltype_violin <- function(df, celltype_column, adt_name) {
# colors, using dplyr since tidyeval land
fill_vector <- df %>%
dplyr::pull({{celltype_column}}) %>%
unique() %>%
length() %>%
rainbow()
celltype_adt_df %>%
dplyr::filter(ADT == adt_name) %>%
dplyr::mutate(celltype_plot_column = forcats::fct_reorder({{celltype_column}}, logcounts)) %>%
ggplot() +
aes(x = celltype_plot_column,
y = logcounts,
fill = celltype_plot_column) +
geom_violin(alpha = 0.7) +
stat_summary() +
scale_fill_manual(values = fill_vector) +
labs(
title = glue::glue("{adt_name} normalized expression across celltypes"),
x = "Celltypes",
y = "Normalized expression"
) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 30,
hjust = 1,
size = rel(0.8)))
}
## CD3
adt_celltype_violin(celltype_adt_df, celltype, "CD3")
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
No summary function supplied, defaulting to `mean_se()`
Warning: Removed 4 rows containing missing values (geom_segment).
adt_celltype_violin(celltype_adt_df, celltype_collapsed, "CD3")
No summary function supplied, defaulting to `mean_se()`
## CD19
adt_celltype_violin(celltype_adt_df, celltype, "CD19")
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
No summary function supplied, defaulting to `mean_se()`
Warning: Removed 4 rows containing missing values (geom_segment).
adt_celltype_violin(celltype_adt_df, celltype_collapsed, "CD19")
No summary function supplied, defaulting to `mean_se()`
## CD123
adt_celltype_violin(celltype_adt_df, celltype, "CD123")
Warning: Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
Groups with fewer than two data points have been dropped.
No summary function supplied, defaulting to `mean_se()`
Warning: Removed 4 rows containing missing values (geom_segment).
adt_celltype_violin(celltype_adt_df, celltype_collapsed, "CD123")
No summary function supplied, defaulting to `mean_se()`
To build a sense of how different references perform, let’s just look at one SCE (arbitrarily chosen as the first):
sce <- readr::read_rds(sce_file_paths[[1]])
Functions for comparing references:
# Return a tibble of predictions from the two references
# Define outside of function to avoid rerunning over and over and over AND OVER.
hpca_ref <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
Warning: Unable to map 1470 of 19363 requested IDs.
blueprint_encode_ref <- celldex::BlueprintEncodeData(ensembl = TRUE)
snapshotDate(): 2022-04-26
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
see ?celldex and browseVignettes('celldex') for documentation
loading from cache
snapshotDate(): 2022-04-25
loading from cache
Warning: Unable to map 532 of 19859 requested IDs.
compare_predictions <- function(sce,
label_name) {
# HPCA prediction
preds_hpca <- SingleR::SingleR(
test = sce,
ref = hpca_ref,
labels = colData(hpca_ref)[,label_name],
BPPARAM = BiocParallel::MulticoreParam(params$cores)
)
# Blueprint/ENCODE prediction
preds_be <- SingleR::SingleR(
test = sce,
ref = blueprint_encode_ref,
labels = colData(blueprint_encode_ref)[,label_name],
BPPARAM = BiocParallel::MulticoreParam(params$cores)
)
# Combine results into a wide tibble:
preds_df <- tibble::tibble(
hpca = preds_hpca$labels,
cell_barcode = rownames(preds_hpca)) %>%
dplyr::inner_join(
tibble::tibble(
blueprint_encode = preds_be$labels,
cell_barcode = rownames(preds_be))
)
return(preds_df)
}
The celldex
package contains bulk RNA-Seq datasets for use as reference. These two
are likely most useful:
The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.
The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).
main_labels_df <- compare_predictions(sce, "label.main")
Joining, by = "cell_barcode"
fine_labels_df <- compare_predictions(sce, "label.fine")
Joining, by = "cell_barcode"
# remove refs which are large
rm(hpca_ref)
rm(blueprint_encode_ref)
How do these predictions compare? Note that these references often use different labels for the same cell type so we shouldn’t expect a perfect diagonal below:
#function for making comparison heatmaps
make_comparison_heatmap <- function(df) {
df %>%
dplyr::mutate(
hpca = forcats::fct_rev(forcats::fct_infreq(hpca)),
blueprint_encode = forcats::fct_rev(forcats::fct_infreq(blueprint_encode))
) %>%
dplyr::count(hpca, blueprint_encode) %>%
ggplot() +
aes(x = blueprint_encode, y = hpca, fill = n) +
geom_tile(color = "black") +
geom_text(aes(label = n), size = rel(1.2)) +
scale_fill_distiller(palette = "RdYlBu", name = "Num. cells") +
# background grid from theme_bw does not match up with rectangles.
theme_classic() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
}
make_comparison_heatmap(main_labels_df) + ggtitle("Broad label comparison")
make_comparison_heatmap(fine_labels_df) + ggtitle("Fine label comparison")
sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Big Sur ... 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] ensembldb_2.20.2 AnnotationFilter_1.20.0
[3] GenomicFeatures_1.48.3 AnnotationDbi_1.58.0
[5] celldex_1.6.0 magrittr_2.0.3
[7] ggplot2_3.3.6 SingleCellExperiment_1.18.0
[9] SummarizedExperiment_1.26.1 Biobase_2.56.0
[11] GenomicRanges_1.48.0 GenomeInfoDb_1.32.3
[13] IRanges_2.30.0 S4Vectors_0.34.0
[15] BiocGenerics_0.42.0 MatrixGenerics_1.8.1
[17] matrixStats_0.62.0
loaded via a namespace (and not attached):
[1] AnnotationHub_3.4.0 BiocFileCache_2.4.0
[3] lazyeval_0.2.2 splines_4.2.1
[5] BiocParallel_1.30.3 digest_0.6.30
[7] htmltools_0.5.3 viridis_0.6.2
[9] fansi_1.0.3 memoise_2.0.1
[11] ScaledMatrix_1.4.0 tzdb_0.3.0
[13] Biostrings_2.64.0 miQC_1.4.0
[15] readr_2.1.2 vroom_1.5.7
[17] prettyunits_1.1.1 colorspace_2.0-3
[19] blob_1.2.3 rappdirs_0.3.3
[21] xfun_0.34 dplyr_1.0.9
[23] crayon_1.5.1 RCurl_1.98-1.8
[25] jsonlite_1.8.3 glue_1.6.2
[27] gtable_0.3.0 zlibbioc_1.42.0
[29] XVector_0.36.0 DelayedArray_0.22.0
[31] BiocSingular_1.12.0 scales_1.2.0
[33] pheatmap_1.0.12 DBI_1.1.3
[35] Rcpp_1.0.9 viridisLite_0.4.0
[37] xtable_1.8-4 progress_1.2.2
[39] bit_4.0.4 rsvd_1.0.5
[41] httr_1.4.3 RColorBrewer_1.1-3
[43] modeltools_0.2-23 ellipsis_0.3.2
[45] pkgconfig_2.0.3 XML_3.99-0.10
[47] flexmix_2.3-18 farver_2.1.1
[49] nnet_7.3-17 sass_0.4.2
[51] dbplyr_2.2.1 utf8_1.2.2
[53] here_1.0.1 tidyselect_1.1.2
[55] labeling_0.4.2 rlang_1.0.6
[57] later_1.3.0 munsell_0.5.0
[59] BiocVersion_3.15.2 tools_4.2.1
[61] cachem_1.0.6 cli_3.4.1
[63] generics_0.1.3 RSQLite_2.2.15
[65] ExperimentHub_2.4.0 evaluate_0.18
[67] stringr_1.4.1 fastmap_1.1.0
[69] yaml_2.3.6 knitr_1.40
[71] bit64_4.0.5 purrr_0.3.4
[73] KEGGREST_1.36.3 sparseMatrixStats_1.8.0
[75] mime_0.12 xml2_1.3.3
[77] biomaRt_2.52.0 compiler_4.2.1
[79] filelock_1.0.2 curl_4.3.2
[81] png_0.1-7 interactiveDisplayBase_1.34.0
[83] tibble_3.1.8 bslib_0.4.1
[85] stringi_1.7.8 highr_0.9
[87] forcats_0.5.2 lattice_0.20-45
[89] ProtGenerics_1.28.0 Matrix_1.4-1
[91] vctrs_0.4.1 pillar_1.8.0
[93] lifecycle_1.0.1 BiocManager_1.30.18
[95] jquerylib_0.1.4 BiocNeighbors_1.14.0
[97] bitops_1.0-7 irlba_2.3.5
[99] httpuv_1.6.5 rtracklayer_1.56.1
[101] R6_2.5.1 BiocIO_1.6.0
[103] promises_1.2.0.1 renv_0.16.0
[105] gridExtra_2.3 SingleR_1.10.0
[107] codetools_0.2-18 assertthat_0.2.1
[109] rprojroot_2.0.3 rjson_0.2.21
[111] withr_2.5.0 GenomicAlignments_1.32.1
[113] Rsamtools_2.12.0 GenomeInfoDbData_1.2.8
[115] parallel_4.2.1 hms_1.1.1
[117] grid_4.2.1 beachmat_2.12.0
[119] tidyr_1.2.0 rmarkdown_2.18
[121] DelayedMatrixStats_1.18.0 shiny_1.7.2
[123] restfulr_0.0.15